Setup

knitr::opts_chunk$set(echo=TRUE)
knitr::opts_chunk$set(warning=FALSE, message=FALSE)

# Load libraries
# library(ggchromatic)
library(scater)
library(ggpubr)
library(igraph)
library(dittoSeq)
library(Rphenograph)
library(aricode)
library(viridis)
library(ggrepel)
library(LaplacesDemon)
library(geiger)
library(DescTools)
library(RColorBrewer)
# Set general input paths to all analysis files
analysis.path <- "/mnt/bb_dqbm_volume/analysis"

# Set path to masks
tonsil.path <- "/mnt/bb_dqbm_volume/data/Tonsil_th152"
lung.path <- "/mnt/bb_dqbm_volume/data/Immucan_lung"
# Import helper fncs
source("../analysis/helpers/helper_fnc.R")

Read Inputs

first, we look at U computed using different training datasets. For this, we read in the results from CISI training on the Tonsil th152 and the Immucan lung dataset (tonsil_vs_lung.py).

## Read results
# Read in all results for tonsil and lung comparison into one dataframe
files <- list.files(analysis.path, "simulation_results.txt",
                    full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
files <- files[grepl(
  "tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", files)]
res <- lapply(files, read_results, type="res") %>% 
  bind_rows() 

# Harmonize all dataset names
patterns <- unique(unlist(lapply(res$training, function(name){
  if (length(str_split(name, "_")[[1]])==1){
    name
  }
})))

res$training <- unlist(lapply(res$training, function(pat){
  patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))
res$dataset <- unlist(lapply(res$dataset, function(pat){
  patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))

## Read U
u.files <- list.files(analysis.path, "gene_modules.csv",
                      full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
u.files <- u.files[grepl(
  "tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", u.files)]
u <- lapply(u.files, read_U, type="training") %>% 
  bind_rows() %>%
  dplyr::rename(dataset=rep)

## Read X_test and X_simulated
# Specify X_test and X_simulated files to be read in
X.files <- list.files(analysis.path, "X_", full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
X.files <- X.files[grepl(
  "tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", X.files)]
X.files <- X.files[!grepl("_processed", X.files)]

# Read processed SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# X.files <- X.files[grepl("_processed", X.files)]

# Read in all sce experiments from saved anndata and save additional info in metadata
# (e.g. which dataset is used in training and testing, which k was used and if it
# is the ground truth or simulated X)
# Remove neg. values and transform counts?
X.list <- lapply(X.files, read_results, type="x")
X.list <- lapply(X.list, function(sce.temp){
  pat <- metadata(sce.temp)
  metadata(sce.temp)$dataset <- patterns[str_detect(pat$dataset, 
                                                    fixed(patterns, ignore_case=TRUE))]
  metadata(sce.temp)$training <- patterns[str_detect(pat$training,
                                                     fixed(patterns, ignore_case=TRUE))]
  assay.name <- names(assays(sce.temp))
  for (a in assay.name[-1]) {
    assay(sce.temp, a) <- NULL
  }
  assay(sce.temp, "exprs") <- asinh(counts(sce.temp)/1)
  
  sce.temp
})

# Add reduced dimensions
set.seed(220225)
X.list <- lapply(X.list, function(sce){
  sce <- runUMAP(sce, exprs_values="exprs") 
  sce <- runTSNE(sce, exprs_values="exprs")
  sce
})

# Save SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# lapply(1:(length(X.list)), function(i){saveRDS(X.list[i], 
#                                                paste0(gsub("\\..*", "", X.files[i]),
#                                                       "_processed.rds"))})

## Read masks
# Read in masks for tonsil data
masks.tonsil <- loadImages(file.path(tonsil.path, "steinbock/masks_deepcell"), as.is=TRUE)
mcols(masks.tonsil) <- DataFrame(sample_id=names(masks.tonsil))

# Read in masks for lung data
masks.lung <- loadImages(file.path(lung.path, "masks"), as.is=TRUE)
mcols(masks.lung) <- DataFrame(sample_id=names(masks.lung))

## Read original H5ad SCE objects
tonsil.original <- readH5AD(file.path(tonsil.path, "preprocessed_data/spe.h5ad"))
lung.original <- readH5AD(file.path(lung.path, "Lung_sce.h5ad"))
original.list <- list(tonsil.original, lung.original)
names(original.list) <- c("tonsil_tonsil", "lung_lung")

Plot defaults

# Define celltype colours used by the rest of the script
celltype.col <- c(brewer.pal(12, "Paired"),
                           brewer.pal(8, "Pastel2")[-c(3,5,8)],
                           brewer.pal(12, "Set3")[-c(2,3,8,9,11,12)])
celltype.col <- c(celltype.col[1:length(unique(colData(lung.original)$celltype))],
                  "black")
names(celltype.col) <- c(as.character(unique(colData(lung.original)$celltype)), NA)
celltype.col <-  celltype.col[colData(lung.original) %>% 
                                as.data.frame() %>%
                                dplyr::pull(celltype) %>% 
                                levels]
# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(tonsil.original)
rm(lung.original)

CISI results

Results

# For each different measurement of training results, plot a barplot comparing
# diff. k-sparsities, simulation types and training-test set combinations
for (measure in names(res)[2:10]) {
  cat("##### ", measure, "\n")
  
  # Reorder dataframe for plotting
  data_temp <- res %>%
    dplyr::select(dataset, training, simulation, k, !!measure) %>%
    mutate(group=paste(training, dataset, sep="_"))
  
  # Create barplot
  results.barplot <- plot_cisi_results(data_temp, "group", measure, "k")
  
  print(results.barplot)
  cat("\n\n")
}
Overall pearson

Overall spearman

Gene average

Sample average

Sample dist pearson

Sample dist spearman

Gene dist pearson

Gene dist spearman

Matrix coherence (90th ptile)

U

Heatmaps of U

cor <- plot_U(u, "k", "dataset")
k = 1

k = 3

Module Comparisons

plot_U_membership(u, "k", "dataset")
k = 1

k = 3

Mantel Test between U’s

# Calculate mantel test between U's of tonsil and lung for all k's
mantel <- lapply(cor, function(l){
  mantel_test(l[[1]], l[[2]])$mantel_r
  }) %>%
  as.data.frame(col.names=names(cor), check.names=FALSE)

knitr::kable(mantel, digits=2,
             col.names=paste0("k=", names(mantel)))
k=1 k=3
0.66 0.51

Image results

SMA of Test Image (Tonsil th152, k=1)

# Subset to training and test of tonsil data
X.tonsil1 <- keep(X.list, function(x){
  metadata(x)$training=="tonsil" & metadata(x)$dataset=="tonsil" & metadata(x)$k=="1"})
names(X.tonsil1) <- lapply(X.tonsil1, 
                            function(x){metadata(x)$ground_truth})
# Call plot_cells to get individual plots for each roi for decomposed and true
# results
poi <- "SMA"
X.imgList <- plot_cells(X.tonsil1, masks.tonsil, 
                        poi)
# Plot decomposed vs true results for test roi (002)
X.img <- plot_grid(plotlist=append(X.imgList[grepl("20220520_TsH_th152_cisi1_002",
                                                   names(X.imgList))],
                                   X.imgList[grepl("legend",
                                                   names(X.imgList))]),
                   ncol=2, labels=names(X.tonsil1),
                   label_size=15, hjust=c(-2, -1.5), 
                   vjust=1, scale=0.9)
X.img

CD20 of Test Image Overlayed (Tonsil th152, k=1)

# Rename list of tonsil data for nicer titles in plotting
X.tonsil1Renamed <- lapply(names(X.tonsil1), function(n){
  rownames(X.tonsil1[[n]]) <- paste(rownames(X.tonsil1[[n]]),
                                     n, sep="\n")
  reducedDims(X.tonsil1[[n]]) <- NULL
  X.tonsil1[[n]]
})
names(X.tonsil1Renamed) <- names(X.tonsil1)
# Add decomposed and true dataset of tonsil into one SCE
X.overlayed <- do.call("rbind", X.tonsil1Renamed)

# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.tonsil1)

# Define protein of interest
pois <- c("CD20\nsimulated", "CD20\nground_truth")

# Plot cells coloured according to decomposed and true poi values 
# (if similar in both should have a mixture of colours, e.g. orange) 
X.imgDiff <- plotCells(mask=masks.tonsil[unique(colData(X.overlayed)$sample_id)], 
                       object=X.overlayed,
                       cell_id="ObjectNumber", img_id="sample_id", colour_by=pois,
                       return_plot=TRUE,  image_title=list(cex=1),
                       scale_bar=list(cex=1, lwidth=5),
                       legend=list(colour_by.title.cex=0.7, colour_by.labels.cex=0.7))

# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.tonsil1Renamed)

# Draw plot
ggdraw(X.imgDiff$plot)

Counts results

Scatterplots of arcsinh Transformed Counts (Immucan Lung)

Plot of arcsinh transformed counts coloured by defined celltype.

# Subset to dataset trained and tested on the Immucan lung dataset
X.lung_lung <- keep(X.list, function(x){
  metadata(x)$training=="lung" & metadata(x)$dataset=="lung"})
names(X.lung_lung) <- lapply(X.lung_lung,
                             function(x){metadata(x)$ground_truth})
k_s <- unique(unlist(lapply(X.lung_lung, function(sce_temp){metadata(sce_temp)$k})))

for (k in k_s) {
  cat("##### k =", k, "\n")
  
  # Subset for k of interest
  X.lung_lungK <- keep(X.lung_lung, function(x){metadata(x)$k==k})
  
  # Plot asinh transformed counts of proteins of interest of decomposed and true 
  # datasets coloured by different celltypes
  X.Exprs <- plot_grid(plot_exprs(X.lung_lungK, "B", "CD3", "CD20"),
                       plot_exprs(X.lung_lungK, "BnT", "CD3", "CD20"),
                       plot_exprs(X.lung_lungK, "Neutro", "MPO", "CD15"),
                       plot_exprs(X.lung_lungK, "Neutro", "CD3", "MPO"),
                       plot_exprs(X.lung_lungK, "DC", "CD11c", "CD68"),
                       ncol=1)
  print(X.Exprs)
  cat("\n\n")
}
k = 1

k = 3

Per protein results

Results per Protein (k=1, arcsinh)

Scatterplot of ground truth vs decomposed results per protein for k=1 and asinh transformed counts.

# Add all X_test counts together into dataframe for easier plotting
aoi <- "exprs"
X.df <- lapply(X.list, function(sce){
  counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
                  cell=variable) %>%
    mutate(k=metadata(sce)$k,
           dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
  
}) %>% bind_rows() %>%
  group_by(protein, cell, k, dataset) %>%
  summarise_all(na.omit)

# For each test/training dataset combination plot for each true vs decomposed
# results in scatter plot and add diagonal and regression line, as well as
# R (pearson correlation coefficient)
for (i in unique(X.df$dataset)) {
  cat("#####", i, "\n")
  
  proteinPlot <- ggscatter(X.df %>%
                             dplyr::filter(k=="1" & dataset==i), 
                           x="ground_truth", y="simulated",
                           add="reg.line",
                           color=pal_npg("nrc")("1"),
                           add.params=list(color=pal_npg("nrc")("4")[4],
                                           size=2)) +
    facet_wrap(~ protein, scales="free", ncol=5) +
    theme_cowplot(10) +
    geom_abline(slope=1, linetype="dashed") +
    stat_cor(size=2)
  
  print(proteinPlot)
  cat("\n\n")
}
lung_lung

tonsil_lung

lung_tonsil

tonsil_tonsil

# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.df)

Correlations per Protein (k=1, arcsinh)

Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts.

aoi <- "exprs"
# Calculate correlations between ground truth and simulated data for each protein
X.cor <- lapply(keep(X.list, function(sce){
  metadata(sce)$dataset==metadata(sce)$training
}), function(sce){
  counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
                  cell=variable) %>%
    mutate(k=metadata(sce)$k,
           dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
  
}) %>% bind_rows() %>%
  group_by(protein, cell, k, dataset) %>%
  summarise_all(na.omit) %>%
  group_by(k, dataset, protein) %>%
  summarize(correlation=cor(ground_truth, simulated)) %>%
  dplyr::filter(k=="1") %>%
  dplyr::select(-k)

# Read in original SCE with all samples and proteins and without paper normalization
# of counts/exprs and compute statistics like mean, median, var,... on these
# Add this to above correlation dataframe
aoi.original <- "exprs"
X.cor <- lapply(names(original.list), function(n){
  sce <- original.list[[n]]
  counts.long <- as.data.frame(assays(sce)[[aoi.original]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    dplyr::rename(ground_truth=value,
                  cell=variable) %>%
    dplyr::filter(protein %in% unique(X.cor$protein)) %>%
    group_by(protein) %>%
    mutate(median=median(ground_truth),
           mean=mean(ground_truth),
           var=var(ground_truth),
           mode_pos=Modes(ground_truth, min.size=0.1)$modes[1],
           dataset=n) 
  
  counts.long
}) %>% bind_rows() %>%
  merge(X.cor) %>%
  ungroup() %>%
  rowwise() %>%
  mutate("mean-median"=mean-median)

# Plot correlations for k=1 for each test/training dataset combination
for (i in unique(X.cor$dataset)) {
  cat("#####", i, "\n")
  
  proteinCorr <- plot_protein_cor(X.cor %>% 
                                    dplyr::filter(dataset==i))
  
  print(proteinCorr)
  cat("\n\n")
}
lung_lung

tonsil_tonsil

Protein distributions coloured by correlation results (k=1, arcsinh)

For a manual inspection of different protein distribution characteristics, we plot the boxplots of the original protein expressions coloured by correlation results. These show: Interquartile range (IQR), 1st and 3rd quantiles, median, mean, max, min and outliers.

# # Have a look at dittoRidgePlots of some good/bad performing proteins for lung and
# # tonsil
# tonsil.originalSubset <- tonsil.original
# colData(tonsil.originalSubset)$dummy <- "1"
# for (p in c("MPO", "Ki-67", "CD15", "CD8a", "CD45RO", "CD303", "PD-L1", "SMA", 
#             "TCF1/TCF7", "CD27", "CD14")){
#   print(dittoRidgePlot(tonsil.originalSubset, var=p, group.by="dummy", assay="exprs") +
#           ggtitle(p) +
#           coord_cartesian(xlim=c(0, 6.2), expand=TRUE))
# }
# lung.originalSubset <- lung.original
# colData(lung.originalSubset)$dummy <- "1"
# for (p in c("MPO", "Ki-67", "CD20", "PD-L1", "PD-1", "CD14")){
#   print(dittoRidgePlot(lung.originalSubset, var=p, group.by="dummy", assay="exprs") +
#           ggtitle(p))
# }

# Plot boxplots of protein expr per dataset coloured by protein correlation results
# (performance)
proteinDist <- plot_protein_dist(X.cor)

print(proteinDist)

Correlations per Protein vs Protein Characteristics (k=1, arcsinh)

Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts compared to protein characteristics of ground truth: Median, var, mean and mode_pos.

# Remove indidvidual cell levels to make data frame smaller bc of storage
X.cor <- X.cor %>%
  dplyr::select(-c("cell", "ground_truth")) %>%
  ungroup() %>%
  distinct()

# Define characteristics of interest
coi <- c("median", "var", "mean", "mode_pos", "mean-median")

# Plot correlations for k=1 for each test/training dataset combination
for (i in coi) {
  cat("#####", i, "\n")
  
  proteinChar <- ggplot(X.cor, 
                        aes(x=!!sym(i), y=correlation, label=protein)) +
    geom_point(color=pal_npg("nrc")("1")) +
    facet_wrap(~ dataset, scales="free", ncol=2) +
    theme_cowplot(10) +
    stat_cor(size=2.8,
             label.x.npc="center",
             label.y.npc="bottom") +
    stat_smooth(method="lm",
                formula=y ~ log(x),
                se=FALSE,
                color=pal_npg("nrc")("4")[4], size=2) +
    geom_text_repel(size=2.8)
  
  print(proteinChar)
  cat("\n\n")
}
median

var

mean

mode_pos

mean-median

# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(X.cor)

Cumulative sum of expression values (k=1)

Another way to analyse the protein-wise results instead of the correlation is shown in the subsequent part. In this case, cells are ranked according to their expression in the ground truth data for each protein and we compute the cumulative sum. Then, we plot this against the cumulative sum of the simulated data where the cell order is determined by the expression values in the ground truth data. The results are shown for k=1.

# Collect all counts for each dataset, rank according to the counts and calculate
# the cumulative sum
# Calculate area between curves of ground_truth and simulated
cumsum.df <- lapply(X.list, function(sce){
  counts.long <- as.data.frame(assays(sce)[["counts"]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    group_by(protein) %>%
    dplyr::arrange(desc(value)) %>%
    mutate(cumsum=cumsum(value),
           x=(0:(length(cumsum)-1))) %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
                  cell=variable,
                  !!as.symbol(paste(metadata(sce)$ground_truth, "cumsum", sep="_")):=cumsum,
                  !!as.symbol(paste(metadata(sce)$ground_truth, "x", sep="_")):=x) %>%
    mutate(k=metadata(sce)$k,
           dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
  
  counts.long
}) %>% bind_rows() %>%
  group_by(protein, cell, k, dataset) %>%
  summarise_all(na.omit) %>%
  dplyr::select(-simulated_x) %>%
  dplyr::rename(x=ground_truth_x) %>%
  group_by(k, dataset, protein) %>% 
  dplyr::arrange(x) %>% 
  mutate(simulated_cumsum=cumsum(simulated)) %>%
  rowwise() %>%
  mutate(temp_max=max(simulated_cumsum, ground_truth_cumsum),
         temp_min=min(simulated_cumsum, ground_truth_cumsum)) %>%
  ungroup() %>%
  group_by(k, dataset, protein) %>%
  mutate(auc=geiger:::.area.between.curves(x, temp_min, temp_max,
                                           xrange=c(0, max(x)))) %>%
  dplyr::select(-c("temp_max", "temp_min"))


# Plot cumulative sums for k=1 for each test/training dataset combination
for (i in unique(cumsum.df$dataset)) {
  cat("#####", i, "\n")
  
  protein.cumsum <- ggplot(cumsum.df %>% 
                          dplyr::filter(k=="1" & dataset==i)) +
    geom_step(aes(x=x, y=simulated_cumsum, color="simulated")) +
    geom_step(aes(x=x, y=ground_truth_cumsum, color="ground_truth")) +
    facet_wrap(~protein, scales="free") +
    theme_cowplot(10) + 
    ylab("Cumulative sum of counts") +
    xlab("Ordered cells") +
    scale_colour_manual("", 
                        breaks=c("simulated", "ground_truth"),
                        values=c(simulated=pal_npg("nrc")("1"),
                                 ground_truth=pal_npg("nrc")("2")[2]))
  
  print(protein.cumsum)
  cat("\n\n")
}
lung_lung

tonsil_lung

lung_tonsil

tonsil_tonsil

Area between sumulative sums of expression values (k=1)

To summarize the above analysis, the area between the two lines (of the cumulative sum) is computed for each protein and dataset. Ideally, the lines match perfectly (overlap) and the area would be 0.

# Plot auc of cumulative sums for k=1 for each test/training dataset combination
for (i in unique(cumsum.df$dataset)) {
  cat("#####", i, "\n")
  
  protein.cumsumArea <- ggplot(cumsum.df %>% 
                                 dplyr::filter(k=="1" & dataset==i) %>%
                                 ungroup() %>%
                                 dplyr::select(protein, auc) %>%
                                 distinct,
                               aes(x=reorder(protein, auc), y=auc, fill=protein)) +
    geom_bar(stat="identity") +
    theme_cowplot() +
    scale_fill_igv() +
    guides(fill=FALSE) +
    coord_flip()
  
  print(protein.cumsumArea)
  cat("\n\n")
}
lung_lung

tonsil_lung

lung_tonsil

tonsil_tonsil

# Delete objects no longer needed (since there are a lot of big objects in this RMD)
rm(cumsum.df)

Reduced dimensions and clustering results

Reduced Dimension Plots (k=1, Immucan_lung, arcsinh)

Below the UMAP and TSNE of the arcsinh counts for the Immucan lung dataset are shown using k=1. Red indicates a high expression of the particular protein.

# Plot UMAP and TSNE for k=1 and lung data
for (sce in keep(X.lung_lung, function(x){metadata(x)$k=="1"})){
  cat("#####", metadata(sce)$training, "_", metadata(sce)$dataset, "\n")
  
  # Create empty list for plots
  X.redDimlist <- list()
  # Extract for simulated and true results for UMAP and TSNE and plot and
  # colour by celltype
  for (method in c("UMAP", "TSNE")){
    X.redDimlist <- append(X.redDimlist,
                           list(dittoDimPlot(sce,
                                             var="celltype",
                                             reduction.use=method,
                                             size=0.2,
                                             color.panel=celltype.col) +
                                  theme_cowplot() +
                                  theme(legend.position="bottom") +
                                  guides(color=guide_legend(nrow=4, 
                                                            byrow=TRUE,
                                                            override.aes=list(size=3)))))
    
  }
  # Plot simulated and true reduced dim on top of each other
  X.redDimPlot <- plot_grid(plotlist=X.redDimlist,
                            ncol=2, labels=c("UMAP", "TSNE"),
                            label_size=15, hjust=c(-2, -1.5), vjust=1, scale=0.9)
  
  print(X.redDimPlot)
  cat("\n\n")
}
lung _ lung

lung _ lung

Cluster overlap between ground truth and simulated data (arcsinh)

To investigate the clustering results of ground truth and simulated data a bit more, we show the overlap of clusters computed using Rphenograph (run with k=47 clusters) individually and compare which clusters overlap using a heatmap.

# Subset to data tested on lung
X.lung <- keep(X.list, function(x){
  metadata(x)$dataset=="lung"})
names(X.lung) <- lapply(X.lung, function(sce){
  paste(metadata(sce)$training, metadata(sce)$dataset,
        metadata(sce)$k, metadata(sce)$ground_truth, sep="_")
})

# Compute clusters for each dataset using Rphenograph (for ground truth data,
# clusters are computed twice and an ARI score between them is calculated as a 
# reference)
all.clusters <- lapply(X.lung, 
  function(sce){
  mat <- t(assay(sce, "exprs"))
  out1 <- Rphenograph(mat, k=100)
  clusters <- as.vector(membership(out1[[2]])) %>%
    as.data.frame() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=".") %>%
    mutate(dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"),
           k=metadata(sce)$k,
           celltype=colData(sce)$celltype,
           cell=colData(sce)$cell_id) 
  
  if (metadata(sce)$ground_truth=="simulated"){
    out2 <- Rphenograph(mat, k=100)
    max_ari <- ARI(factor(membership(out1[[2]])), factor(membership(out2[[2]])))
    clusters <- clusters %>%
      mutate(max_ari=max_ari)
  } 
  
  clusters
}) %>% bind_rows() %>%
  group_by(k, dataset, cell, celltype) %>%
  summarise_all(na.omit) %>%
  ungroup()


# Define functions to get overview of celltypes per cluster as row and column annotations
get_anno_clusters <- function(clusters, i, d, direction="reverse", which_sim="row"){
  col_anno <- HeatmapAnnotation(celltypes=anno_barplot(table(paste("gt", clusters %>% 
                                                                     dplyr::filter(k==i & dataset==d) %>% 
                                                                     pull(ground_truth)), 
                                                             clusters %>% 
                                                               dplyr::filter(k==i & dataset==d) %>% 
                                                               pull(celltype)), 
                                                       gp=gpar(fill=celltype.col), 
                                                       bar_width=1, 
                                                       height=unit(5, "cm")),
                                show_annotation_name=FALSE)
  row_anno <- HeatmapAnnotation(celltypes=anno_barplot(table(paste("sim", clusters %>% 
                                                                     dplyr::filter(k==i & dataset==d) %>% 
                                                                     pull(simulated)), 
                                                             clusters %>% 
                                                               dplyr::filter(k==i & dataset==d) %>% 
                                                               pull(celltype)), 
                                                       gp=gpar(fill=celltype.col), 
                                                       bar_width=1, 
                                                       width=unit(5, "cm"),
                                                       axis_param=list(direction=direction)),
                                show_annotation_name=FALSE,
                                which=which_sim)
  
  list(col_anno, row_anno)
}


# Plot heatmaps showing overlap of clusters for each dataset and k
for (i in unique(all.clusters$k)){
  cat("##### k =", i, "\n")
  
  plot.new()
  ht_list <- NULL
  for (d in unique(all.clusters$dataset)){
    # Get table of number of cells being in simulated cluster i and ground truth
    # cluster j
    temp.matrix <- table(paste("sim", all.clusters %>% 
                                 dplyr::filter(k==i & dataset==d) %>% 
                                 pull(simulated)), 
                         paste("gt", all.clusters %>% 
                                 dplyr::filter(k==i & dataset==d) %>% 
                                 pull(ground_truth))) %>% 
      as.matrix()
    temp.matrix <- log10(temp.matrix + 10)
    
    # Get barplot annotations of celltypes
    anno.list <- get_anno_clusters(all.clusters, i, d)
    col_anno <- anno.list[[1]]
    row_anno <- anno.list[[2]]
    
    # Create heatmap
    cluster.overlap <- Heatmap(temp.matrix,
                               show_row_dend=FALSE,
                               show_column_dend=FALSE,
                               top_annotation=col_anno,
                               left_annotation=row_anno,
                               col=magma(100),
                               column_title=d,
                               row_names_gp=gpar(fontsize=8),
                               column_names_gp=gpar(fontsize=8),
                               column_title_gp=gpar(fontsize=10),
                               rect_gp=gpar(col="white", lwd=1)) 
    
    # Add heatmap to list
    ht_list <- append(ht_list, 
                      list(grid.grabExpr(draw(cluster.overlap,
                                              heatmap_legend_list=list(
                                                Legend(labels=names(celltype.col), 
                                                       title="Celltypes",
                                                       legend_gp=gpar(fill=celltype.col)))))))
  }
  
  print(plot_grid(plotlist=ht_list, nrow=1))
  cat("\n\n")
}
k = 1

k = 3

Kullback-Leibner divergence of clusters (arcsinh)

We also compare the clusters of the ground truth data to the clsuters of the simulated data using the Kullback-Leibner divergence of the celltypes.

# Plot heatmaps showing overlap of clusters for each dataset and k
for (i in unique(all.clusters$k)){
  cat("##### k =", i, "\n")
  
  plot.new()
  ht_list.kl <- NULL
  for (d in unique(all.clusters$dataset)){
    clusters.filtered <- all.clusters %>%
      dplyr::filter(k==i & dataset==d) %>%
      dplyr::select(-c("k", "dataset", "max_ari"))
    clusters_sim.kl <- table(clusters.filtered %>% 
                               dplyr::pull(celltype), 
                             paste0("sim_", clusters.filtered %>% 
                               dplyr::pull(simulated))) %>%
      scale(center=FALSE, scale=colSums(.))
    clusters_gt.kl <- table(clusters.filtered %>% 
                               dplyr::pull(celltype), 
                             paste0("gt_", clusters.filtered %>% 
                               dplyr::pull(ground_truth))) %>%
      scale(center=FALSE, scale=colSums(.))
    
    clusters.kl <- apply(clusters_sim.kl, 2, function(x){
      apply(clusters_gt.kl, 2, function(y){
        KLD(x, y)$sum.KLD.py.px})
    })
    
    # Get barplot annotations of celltypes
    anno.list <- get_anno_clusters(all.clusters, i, d)
    col_anno <- anno.list[[1]]
    row_anno <- anno.list[[2]]

    # Create heatmap
    clusters.klPlot <- Heatmap(t(clusters.kl),
                           show_row_dend=FALSE,
                           show_column_dend=FALSE,
                           top_annotation=col_anno,
                           left_annotation=row_anno,
                           col=colorRamp2(c(0, 0.1, 5), rev(magma(3))), #rev(magma(100)),
                           column_title=d,
                           row_names_gp=gpar(fontsize=8),
                           column_names_gp=gpar(fontsize=8),
                           column_title_gp=gpar(fontsize=10),
                           rect_gp=gpar(col="white", lwd=1)) 
    
    # Add heatmap to list
    ht_list.kl <- append(ht_list.kl, 
                         list(grid.grabExpr(draw(clusters.klPlot))))
  }
  
  print(plot_grid(plotlist=ht_list.kl, nrow=1))
  cat("\n\n")
}
k = 1

k = 3

Cluster vs mean expression (arcsinh)

Next, we assess clustering by looking at the median marker expression of each protein in each cluster and the compare it to the celltypes of each cluster. Ideally, each cluster has a median protein expression pattern that fits its main celltype.

for (i in unique(all.clusters$k)){
  cat("##### k =", i, "\n")
  
  plot.new()
  ht_list.exprs <- NULL
  for (d in unique(all.clusters$dataset)){
    cluster_gt.exprs <- assay(X.lung[grepl(i, names(X.lung)) &
                                       grepl(d, names(X.lung)) &
                                       grepl("ground_truth", names(X.lung))][[1]], "exprs") %>%
      as.data.frame() %>%
      mutate(protein=rownames(.)) %>%
      melt() %>%
      dplyr::rename(cell=variable, expr=value)
    cluster_gt.exprs <- all.clusters %>% 
      dplyr::filter(k==i & dataset==d) %>%
      merge(cluster_gt.exprs) %>%
      dplyr::select(-c("k", "max_ari", "dataset", "celltype"))
    cluster_gt.exprs <- cluster_gt.exprs %>%
      mutate(ground_truth=paste("gt", ground_truth)) %>%
      group_by(ground_truth, protein) %>%
      summarise(mean_expr=mean(expr)) %>%
      pivot_wider(names_from=ground_truth, values_from=mean_expr) %>%
      column_to_rownames("protein") %>%
      as.matrix
    
    cluster_sim.exprs <- assay(X.lung[grepl(i, names(X.lung)) &
                                       grepl(d, names(X.lung)) &
                                       grepl("simulated", names(X.lung))][[1]], "exprs") %>%
      as.data.frame() %>%
      mutate(protein=rownames(.)) %>%
      melt() %>%
      dplyr::rename(cell=variable, expr=value)
    cluster_sim.exprs <- all.clusters %>% 
      dplyr::filter(k==i & dataset==d) %>%
      merge(cluster_sim.exprs) %>%
      dplyr::select(-c("k", "max_ari", "dataset", "celltype"))
    cluster_sim.exprs <- cluster_sim.exprs %>%
      mutate(simulated=paste("sim", simulated)) %>%
      group_by(simulated, protein) %>%
      summarise(mean_expr=mean(expr)) %>%
      pivot_wider(names_from=simulated, values_from=mean_expr) %>%
      column_to_rownames("protein") %>%
      as.matrix
    
    # Get barplot annotations of celltypes
    anno.list <- get_anno_clusters(all.clusters, i, d, direction="normal", which_sim="column")
    col_anno <- anno.list[[1]]
    row_anno <- anno.list[[2]]
    
    # Create heatmap
    cluster.exprsPlotGt <- Heatmap(cluster_gt.exprs,
                                 show_row_dend=FALSE,
                                 show_column_dend=FALSE,
                                 top_annotation=col_anno,
                                 col=magma(100),
                                 column_title="Ground truth",
                                 row_names_gp=gpar(fontsize=8),
                                 column_names_gp=gpar(fontsize=8),
                                 column_title_gp=gpar(fontsize=10),
                                 rect_gp=gpar(col="white", lwd=1),
                                 heatmap_legend_param=list(title="Ground truth"))
    cluster.exprsPlotSim <- Heatmap(cluster_sim.exprs,
                                 show_row_dend=FALSE,
                                 show_column_dend=FALSE,
                                 top_annotation=row_anno,
                                 col=magma(100),
                                 column_title="Simulated",
                                 row_names_gp=gpar(fontsize=8),
                                 column_names_gp=gpar(fontsize=8),
                                 column_title_gp=gpar(fontsize=10),
                                 rect_gp=gpar(col="white", lwd=1),
                                 heatmap_legend_param=list(title="Simulated"))
    
    # Add heatmap to list
    ht_list.exprs <- append(ht_list.exprs, 
                            list(grid.grabExpr(draw(cluster.exprsPlotGt + cluster.exprsPlotSim,
                                                    heatmap_legend_list=list(
                                                Legend(labels=names(celltype.col), 
                                                       title="Celltypes",
                                                       legend_gp=gpar(fill=celltype.col))),
                                                column_title=d))))
  }
  
  print(plot_grid(plotlist=ht_list.exprs, nrow=1))
  cat("\n\n")
}
k = 1

k = 3

Adjusted random index (ARI) between clusters (arcsinh)

Finally, we compute the adjusted random index (ARI) between the clustering of the ground truth and simulated data to assess the overlap.

# Add position for labels to dataframe for easier annotation in ggplot
all.clusters <- all.clusters %>%
  ungroup() %>%
  mutate(point.x=ifelse(k=="3", -0.25, 0.25) + as.numeric(as.factor(dataset)))

# Create barplot for ARI (cluster overlap)
ari.plot <- ggplot(all.clusters %>%
                    group_by(dataset, k) %>%
                    summarize(ARI=ARI(simulated, ground_truth),
                              point.x=point.x,
                              max_ari=max_ari)) +
  geom_bar(aes(x=dataset, y=ARI, fill=k), position="dodge", stat="identity") +
  geom_point(aes(x=point.x, y=max_ari)) +
  theme_cowplot() +
  scale_fill_npg() +
  coord_flip() +
  ylim(0, 1)

print(ari.plot)

Cluster overlap (arcsinh)

Since the ARI seems to be pretty bad in all cases, we have a look at another metric regarding cluster overlap between ground truth and simulated clusters. For that, we find the closest cluster in the simulated data for each ground truth cluster and compute the Jaccard index of the two.

# Calculate celltype probabilites per cluster
cluster.jaccard <- all.clusters %>%
  group_by(k, dataset, ground_truth, simulated) %>%
  summarise(count=n()) %>%
  group_by(k, dataset, ground_truth) %>%
  dplyr::filter(count==max(count)) %>%
  dplyr::select(-count) %>%
  ungroup()

# Calculate jaccard index between clusters using celltype probabilities
cluster.jaccard$jaccard <- lapply(1:nrow(cluster.jaccard), function(i){
    df.temp <- all.clusters %>%
      dplyr::filter(k==cluster.jaccard$k[i] & 
               dataset==cluster.jaccard$dataset[i] &
               (ground_truth==cluster.jaccard$ground_truth[i] |
                  simulated==cluster.jaccard$simulated[i])) %>%
      mutate(ground_truth=ifelse(ground_truth==cluster.jaccard$ground_truth[i], 1, 0),
             simulated=ifelse(simulated==cluster.jaccard$simulated[i], 1, 0))

    jaccard(df.temp %>%
              pull(ground_truth), 
            df.temp %>%
              pull(simulated))

  }) %>% unlist()
# Plot jaccard index distribution per dataset and k
for (i in unique(cluster.jaccard$k)) {
  cat("##### k =", i, "\n")
  
  # Create empty plot list to append all dataset jaccard distribution plots to
  plot.list <- list()
  for (d in unique(cluster.jaccard$dataset)){
    # Subset to dataset and k of interest
    temp.data <- cluster.jaccard %>%
      dplyr::filter(k==i & dataset==d)
    # Create plot
    cluster.overlapPlot <- ggplot(temp.data, 
                                  aes(x=jaccard)) +
      geom_density(fill=pal_npg("nrc")("1")) +
      theme_cowplot(10) +
      geom_vline(aes(xintercept=mean(jaccard)),
                     linetype="dashed") +
      geom_text(aes(x=mean(jaccard), label=round(mean(jaccard), 2)), 
             y=Inf, vjust=2, hjust=-2) +
      xlim(0, 1)
    
    plot.list <- append(plot.list, list(cluster.overlapPlot))
  }
  
  # Print plot
  print(plot_grid(plotlist=plot.list, nrow=1,
                  labels=unique(cluster.jaccard$dataset),
                  label_size=15, hjust=c(-2, -1.5), vjust=1))
  cat("\n\n")
}
k = 1

k = 3